From 4ce9d9d80e937f5334a389d028a84870ba93e477 Mon Sep 17 00:00:00 2001 From: "kaf24@firebug.cl.cam.ac.uk" Date: Tue, 29 Mar 2005 21:10:08 +0000 Subject: [PATCH] bitkeeper revision 1.1236.1.154 (4249c430s6iKHaP4AAIWnJQScN1CyA) Fix lazy state switching when context-switching to/from the idle domain. Track which domain's state is on each CPU and, for each domain, which CPUs are running on its page tables. Signed-off-by: Keir Fraser --- xen/arch/ia64/xenmisc.c | 2 +- xen/arch/x86/domain.c | 187 ++++++++++++++++---------- xen/arch/x86/domain_build.c | 4 +- xen/arch/x86/mm.c | 13 +- xen/arch/x86/shadow.c | 1 - xen/arch/x86/smp.c | 252 ++++++++++++++---------------------- xen/arch/x86/x86_32/mm.c | 16 --- xen/arch/x86/x86_64/mm.c | 17 --- xen/common/dom0_ops.c | 1 - xen/common/page_alloc.c | 7 +- xen/common/schedule.c | 1 - xen/include/asm-x86/mm.h | 6 - xen/include/public/xen.h | 28 ++-- xen/include/xen/sched.h | 38 ++++-- xen/include/xen/smp.h | 9 +- 15 files changed, 275 insertions(+), 307 deletions(-) diff --git a/xen/arch/ia64/xenmisc.c b/xen/arch/ia64/xenmisc.c index 40055983db..2e4b436658 100644 --- a/xen/arch/ia64/xenmisc.c +++ b/xen/arch/ia64/xenmisc.c @@ -53,7 +53,7 @@ platform_is_hp_ski(void) } /* calls in xen/common code that are unused on ia64 */ -void synchronise_pagetables(unsigned long cpu_mask) { return; } +void synchronise_execution_state(unsigned long cpu_mask) { } int grant_table_create(struct domain *d) { return 0; } void grant_table_destroy(struct domain *d) diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index dbebab53d6..e26a509f2a 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -45,13 +45,18 @@ static int opt_noreboot = 0; boolean_param("noreboot", opt_noreboot); +struct percpu_ctxt { + struct exec_domain *curr_ed; +} __cacheline_aligned; +static struct percpu_ctxt percpu_ctxt[NR_CPUS]; + static void default_idle(void) { - __cli(); + local_irq_disable(); if ( !softirq_pending(smp_processor_id()) ) safe_halt(); else - __sti(); + local_irq_enable(); } static __attribute_used__ void idle_loop(void) @@ -73,6 +78,8 @@ void startup_cpu_idle_loop(void) { /* Just some sanity to ensure that the scheduler is set up okay. */ ASSERT(current->domain->id == IDLE_DOMAIN_ID); + percpu_ctxt[smp_processor_id()].curr_ed = current; + set_bit(smp_processor_id(), ¤t->domain->cpuset); domain_unpause_by_systemcontroller(current->domain); raise_softirq(SCHEDULE_SOFTIRQ); do_softirq(); @@ -110,7 +117,7 @@ void machine_restart(char * __unused) safe_halt(); } - __sti(); + local_irq_enable(); /* Ensure we are the boot CPU. */ if ( GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_physical_apicid ) @@ -307,10 +314,10 @@ unsigned long alloc_monitor_pagetable(struct exec_domain *ed) struct pfn_info *mmfn_info; struct domain *d = ed->domain; - ASSERT(!pagetable_val(ed->arch.monitor_table)); /* we should only get called once */ + ASSERT(pagetable_val(ed->arch.monitor_table) == 0); mmfn_info = alloc_domheap_page(NULL); - ASSERT( mmfn_info ); + ASSERT(mmfn_info != NULL); mmfn = (unsigned long) (mmfn_info - frame_table); mpl2e = (l2_pgentry_t *) map_domain_mem(mmfn << PAGE_SHIFT); @@ -326,7 +333,7 @@ unsigned long alloc_monitor_pagetable(struct exec_domain *ed) ed->arch.monitor_vtable = mpl2e; - // map the phys_to_machine map into the Read-Only MPT space for this domain + /* Map the p2m map into the Read-Only MPT space for this domain. */ mpl2e[l2_table_offset(RO_MPT_VIRT_START)] = mk_l2_pgentry(pagetable_val(ed->arch.phys_table) | __PAGE_HYPERVISOR); @@ -578,19 +585,10 @@ void toggle_guest_mode(struct exec_domain *ed) : "=r" (__r) : "r" (value), "0" (__r) );\ __r; }) -static void switch_segments( - struct xen_regs *regs, struct exec_domain *p, struct exec_domain *n) +static void load_segments(struct exec_domain *p, struct exec_domain *n) { int all_segs_okay = 1; - if ( !is_idle_task(p->domain) ) - { - __asm__ __volatile__ ( "movl %%ds,%0" : "=m" (p->arch.user_ctxt.ds) ); - __asm__ __volatile__ ( "movl %%es,%0" : "=m" (p->arch.user_ctxt.es) ); - __asm__ __volatile__ ( "movl %%fs,%0" : "=m" (p->arch.user_ctxt.fs) ); - __asm__ __volatile__ ( "movl %%gs,%0" : "=m" (p->arch.user_ctxt.gs) ); - } - /* Either selector != 0 ==> reload. */ if ( unlikely(p->arch.user_ctxt.ds | n->arch.user_ctxt.ds) ) @@ -654,7 +652,8 @@ static void switch_segments( if ( unlikely(!all_segs_okay) ) { - unsigned long *rsp = + struct xen_regs *regs = get_execution_context(); + unsigned long *rsp = (n->arch.flags & TF_kernel_mode) ? (unsigned long *)regs->rsp : (unsigned long *)n->arch.kernel_sp; @@ -689,6 +688,24 @@ static void switch_segments( } } +static void save_segments(struct exec_domain *p) +{ + __asm__ __volatile__ ( "movl %%ds,%0" : "=m" (p->arch.user_ctxt.ds) ); + __asm__ __volatile__ ( "movl %%es,%0" : "=m" (p->arch.user_ctxt.es) ); + __asm__ __volatile__ ( "movl %%fs,%0" : "=m" (p->arch.user_ctxt.fs) ); + __asm__ __volatile__ ( "movl %%gs,%0" : "=m" (p->arch.user_ctxt.gs) ); +} + +static void clear_segments(void) +{ + __asm__ __volatile__ ( + "movl %0,%%ds; " + "movl %0,%%es; " + "movl %0,%%fs; " + "movl %0,%%gs; swapgs; movl %0,%%gs" + : : "r" (0) ); +} + long do_switch_to_user(void) { struct xen_regs *regs = get_execution_context(); @@ -720,80 +737,96 @@ long do_switch_to_user(void) #elif defined(__i386__) -#define switch_segments(_r, _p, _n) ((void)0) +#define load_segments(_p, _n) ((void)0) +#define save_segments(_p) ((void)0) +#define clear_segments() ((void)0) #endif -/* - * This special macro can be used to load a debugging register - */ #define loaddebug(_ed,_reg) \ - __asm__("mov %0,%%db" #_reg \ - : /* no output */ \ - :"r" ((_ed)->debugreg[_reg])) + __asm__ __volatile__ ("mov %0,%%db" #_reg : : "r" ((_ed)->debugreg[_reg])) -void context_switch(struct exec_domain *prev_p, struct exec_domain *next_p) +static void __context_switch(void) { -#ifdef __i386__ - struct tss_struct *tss = init_tss + smp_processor_id(); -#endif execution_context_t *stack_ec = get_execution_context(); + unsigned int cpu = smp_processor_id(); + struct exec_domain *p = percpu_ctxt[cpu].curr_ed; + struct exec_domain *n = current; - __cli(); - - /* Switch guest general-register state. */ - if ( !is_idle_task(prev_p->domain) ) + if ( !is_idle_task(p->domain) ) { - memcpy(&prev_p->arch.user_ctxt, + memcpy(&p->arch.user_ctxt, stack_ec, sizeof(*stack_ec)); - unlazy_fpu(prev_p); - CLEAR_FAST_TRAP(&prev_p->arch); + unlazy_fpu(p); + CLEAR_FAST_TRAP(&p->arch); + save_segments(p); } - if ( !is_idle_task(next_p->domain) ) - { - memcpy(stack_ec, - &next_p->arch.user_ctxt, - sizeof(*stack_ec)); + memcpy(stack_ec, + &n->arch.user_ctxt, + sizeof(*stack_ec)); - /* Maybe switch the debug registers. */ - if ( unlikely(next_p->arch.debugreg[7]) ) - { - loaddebug(&next_p->arch, 0); - loaddebug(&next_p->arch, 1); - loaddebug(&next_p->arch, 2); - loaddebug(&next_p->arch, 3); - /* no 4 and 5 */ - loaddebug(&next_p->arch, 6); - loaddebug(&next_p->arch, 7); - } + /* Maybe switch the debug registers. */ + if ( unlikely(n->arch.debugreg[7]) ) + { + loaddebug(&n->arch, 0); + loaddebug(&n->arch, 1); + loaddebug(&n->arch, 2); + loaddebug(&n->arch, 3); + /* no 4 and 5 */ + loaddebug(&n->arch, 6); + loaddebug(&n->arch, 7); + } - if ( !VMX_DOMAIN(next_p) ) - { - SET_FAST_TRAP(&next_p->arch); + if ( !VMX_DOMAIN(n) ) + { + SET_FAST_TRAP(&n->arch); #ifdef __i386__ + { /* Switch the kernel ring-1 stack. */ - tss->esp1 = next_p->arch.kernel_sp; - tss->ss1 = next_p->arch.kernel_ss; -#endif + struct tss_struct *tss = &init_tss[cpu]; + tss->esp1 = n->arch.kernel_sp; + tss->ss1 = n->arch.kernel_ss; } - - /* Switch page tables. */ - write_ptbase(next_p); +#endif } - set_current(next_p); + set_bit(cpu, &n->domain->cpuset); + write_ptbase(n); + clear_bit(cpu, &p->domain->cpuset); - __asm__ __volatile__ ("lgdt %0" : "=m" (*next_p->arch.gdt)); + __asm__ __volatile__ ( "lgdt %0" : "=m" (*n->arch.gdt) ); + + percpu_ctxt[cpu].curr_ed = n; +} - __sti(); - if ( !VMX_DOMAIN(next_p) ) +void context_switch(struct exec_domain *prev, struct exec_domain *next) +{ + struct exec_domain *realprev; + + local_irq_disable(); + + set_current(next); + + if ( ((realprev = percpu_ctxt[smp_processor_id()]. curr_ed) == next) || + is_idle_task(next->domain) ) { - load_LDT(next_p); - switch_segments(stack_ec, prev_p, next_p); + local_irq_enable(); + } + else + { + __context_switch(); + + local_irq_enable(); + + if ( !VMX_DOMAIN(next) ) + { + load_LDT(next); + load_segments(realprev, next); + } } /* @@ -802,13 +835,27 @@ void context_switch(struct exec_domain *prev_p, struct exec_domain *next_p) * 'prev' (after this point, a dying domain's info structure may be freed * without warning). */ - clear_bit(EDF_RUNNING, &prev_p->ed_flags); + clear_bit(EDF_RUNNING, &prev->ed_flags); - schedule_tail(next_p); + schedule_tail(next); BUG(); } +static void __synchronise_lazy_execstate(void *unused) +{ + if ( percpu_ctxt[smp_processor_id()].curr_ed != current ) + { + __context_switch(); + load_LDT(current); + clear_segments(); + } +} +void synchronise_lazy_execstate(unsigned long cpuset) +{ + smp_subset_call_function(__synchronise_lazy_execstate, NULL, 1, cpuset); +} + unsigned long __hypercall_create_continuation( unsigned int op, unsigned int nr_args, ...) { @@ -947,13 +994,11 @@ void domain_relinquish_memory(struct domain *d) { struct exec_domain *ed; - /* Ensure that noone is running over the dead domain's page tables. */ - synchronise_pagetables(~0UL); + BUG_ON(d->cpuset != 0); /* Release device mappings of other domains */ gnttab_release_dev_mappings( d->grant_table ); - /* Exit shadow mode before deconstructing final guest page table. */ shadow_mode_disable(d); diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c index 54e5caa7e0..3ad344ce09 100644 --- a/xen/arch/x86/domain_build.c +++ b/xen/arch/x86/domain_build.c @@ -421,7 +421,7 @@ int construct_dom0(struct domain *d, update_pagetables(ed); /* Install the new page tables. */ - __cli(); + local_irq_disable(); write_ptbase(ed); /* Copy the OS image and free temporary buffer. */ @@ -498,7 +498,7 @@ int construct_dom0(struct domain *d, /* Reinstate the caller's page tables. */ write_ptbase(current); - __sti(); + local_irq_enable(); #if defined(__i386__) /* Destroy low mappings - they were only for our convenience. */ diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c index 0c8322bfc9..c76dd791bc 100644 --- a/xen/arch/x86/mm.c +++ b/xen/arch/x86/mm.c @@ -1147,16 +1147,13 @@ int get_page_type(struct pfn_info *page, u32 type) * may be unnecessary (e.g., page was GDT/LDT) but those * circumstances should be very rare. */ - struct exec_domain *ed; - unsigned long mask = 0; - for_each_exec_domain ( page_get_owner(page), ed ) - mask |= 1 << ed->processor; - mask = tlbflush_filter_cpuset(mask, page->tlbflush_timestamp); + unsigned long cpuset = tlbflush_filter_cpuset( + page_get_owner(page)->cpuset, page->tlbflush_timestamp); - if ( unlikely(mask != 0) ) + if ( unlikely(cpuset != 0) ) { perfc_incrc(need_flush_tlb_flush); - flush_tlb_mask(mask); + flush_tlb_mask(cpuset); } /* We lose existing type, back pointer, and validity. */ @@ -2842,7 +2839,7 @@ void audit_domain(struct domain *d) if ( d != current->domain ) domain_pause(d); - synchronise_pagetables(~0UL); + synchronise_lazy_execstate(~0UL); printk("pt base=%lx sh_info=%x\n", pagetable_val(d->exec_domain[0]->arch.guest_table)>>PAGE_SHIFT, diff --git a/xen/arch/x86/shadow.c b/xen/arch/x86/shadow.c index 4e755d0597..e47eccf8ee 100644 --- a/xen/arch/x86/shadow.c +++ b/xen/arch/x86/shadow.c @@ -384,7 +384,6 @@ int shadow_mode_control(struct domain *d, dom0_shadow_control_t *sc) } domain_pause(d); - synchronise_pagetables(~0UL); shadow_lock(d); diff --git a/xen/arch/x86/smp.c b/xen/arch/x86/smp.c index fc7dfc42b5..962417238f 100644 --- a/xen/arch/x86/smp.c +++ b/xen/arch/x86/smp.c @@ -59,9 +59,7 @@ */ /* - * the following functions deal with sending IPIs between CPUs. - * - * We use 'broadcast', CPU->CPU IPIs and self-IPIs too. + * The following functions deal with sending IPIs between CPUs. */ static inline int __prepare_ICR (unsigned int shortcut, int vector) @@ -82,22 +80,22 @@ static inline void __send_IPI_shortcut(unsigned int shortcut, int vector) * of the value read we use an atomic rmw access to avoid costly * cli/sti. Otherwise we use an even cheaper single atomic write * to the APIC. - */ + */ unsigned int cfg; /* - * Wait for idle. - */ + * Wait for idle. + */ apic_wait_icr_idle(); /* - * No need to touch the target chip field - */ + * No need to touch the target chip field + */ cfg = __prepare_ICR(shortcut, vector); /* - * Send the IPI. The write to APIC_ICR fires this off. - */ + * Send the IPI. The write to APIC_ICR fires this off. + */ apic_write_around(APIC_ICR, cfg); } @@ -111,106 +109,44 @@ static inline void send_IPI_mask(int mask, int vector) unsigned long cfg; unsigned long flags; - __save_flags(flags); - __cli(); + local_irq_save(flags); - /* * Wait for idle. */ apic_wait_icr_idle(); - + /* * prepare target chip field */ cfg = __prepare_ICR2(mask); apic_write_around(APIC_ICR2, cfg); - + /* * program the ICR */ cfg = __prepare_ICR(0, vector); - + /* * Send the IPI. The write to APIC_ICR fires this off. */ apic_write_around(APIC_ICR, cfg); - __restore_flags(flags); + local_irq_restore(flags); } static inline void send_IPI_allbutself(int vector) { /* - * if there are no other CPUs in the system then - * we get an APIC send error if we try to broadcast. - * thus we have to avoid sending IPIs in this case. + * If there are no other CPUs in the system then we get an APIC send error + * if we try to broadcast. thus we have to avoid sending IPIs in this case. */ - if (!(smp_num_cpus > 1)) + if ( smp_num_cpus <= 1 ) return; __send_IPI_shortcut(APIC_DEST_ALLBUT, vector); } -/* - * ********* XEN NOTICE ********** - * I've left the following comments lying around as they look liek they might - * be useful to get multiprocessor guest OSes going. However, I suspect the - * issues we face will be quite different so I've ripped out all the - * TLBSTATE logic (I didn't understand it anyway :-). These comments do - * not apply to Xen, therefore! -- Keir (8th Oct 2003). - */ -/* - * Smarter SMP flushing macros. - * c/o Linus Torvalds. - * - * These mean you can really definitely utterly forget about - * writing to user space from interrupts. (Its not allowed anyway). - * - * Optimizations Manfred Spraul - * - * The flush IPI assumes that a thread switch happens in this order: - * [cpu0: the cpu that switches] - * 1) switch_mm() either 1a) or 1b) - * 1a) thread switch to a different mm - * 1a1) clear_bit(cpu, &old_mm.cpu_vm_mask); - * Stop ipi delivery for the old mm. This is not synchronized with - * the other cpus, but smp_invalidate_interrupt ignore flush ipis - * for the wrong mm, and in the worst case we perform a superflous - * tlb flush. - * 1a2) set cpu_tlbstate to TLBSTATE_OK - * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 - * was in lazy tlb mode. - * 1a3) update cpu_tlbstate[].active_mm - * Now cpu0 accepts tlb flushes for the new mm. - * 1a4) set_bit(cpu, &new_mm.cpu_vm_mask); - * Now the other cpus will send tlb flush ipis. - * 1a4) change cr3. - * 1b) thread switch without mm change - * cpu_tlbstate[].active_mm is correct, cpu0 already handles - * flush ipis. - * 1b1) set cpu_tlbstate to TLBSTATE_OK - * 1b2) test_and_set the cpu bit in cpu_vm_mask. - * Atomically set the bit [other cpus will start sending flush ipis], - * and test the bit. - * 1b3) if the bit was 0: leave_mm was called, flush the tlb. - * 2) switch %%esp, ie current - * - * The interrupt must handle 2 special cases: - * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. - * - the cpu performs speculative tlb reads, i.e. even if the cpu only - * runs in kernel space, the cpu could load tlb entries for user space - * pages. - * - * The good news is that cpu_tlbstate is local to each cpu, no - * write/read ordering problems. - * - * TLB flush IPI: - * - * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. - * 2) Leave the mm if we are in the lazy tlb mode. - */ - static spinlock_t flush_lock = SPIN_LOCK_UNLOCKED; static unsigned long flush_cpumask; @@ -226,21 +162,19 @@ void flush_tlb_mask(unsigned long mask) { ASSERT(local_irq_is_enabled()); - if ( mask & (1 << smp_processor_id()) ) + if ( mask & (1UL << smp_processor_id()) ) { local_flush_tlb(); - mask &= ~(1 << smp_processor_id()); + mask &= ~(1UL << smp_processor_id()); } if ( mask != 0 ) { spin_lock(&flush_lock); - flush_cpumask = mask; send_IPI_mask(mask, INVALIDATE_TLB_VECTOR); while ( flush_cpumask != 0 ) cpu_relax(); - spin_unlock(&flush_lock); } } @@ -254,7 +188,8 @@ void new_tlbflush_clock_period(void) if ( smp_num_cpus > 1 ) { spin_lock(&flush_lock); - flush_cpumask = ((1 << smp_num_cpus) - 1) & ~(1 << smp_processor_id()); + flush_cpumask = (1UL << smp_num_cpus) - 1; + flush_cpumask &= ~(1UL << smp_processor_id()); send_IPI_allbutself(INVALIDATE_TLB_VECTOR); while ( flush_cpumask != 0 ) cpu_relax(); @@ -266,124 +201,138 @@ void new_tlbflush_clock_period(void) tlbflush_clock++; } -static void flush_tlb_all_pge_ipi(void* info) +static void flush_tlb_all_pge_ipi(void *info) { __flush_tlb_pge(); } void flush_tlb_all_pge(void) { - smp_call_function (flush_tlb_all_pge_ipi,0,1,1); + smp_call_function(flush_tlb_all_pge_ipi, 0, 1, 1); __flush_tlb_pge(); } void smp_send_event_check_mask(unsigned long cpu_mask) { - cpu_mask &= ~(1< The function to run. This must be fast and non-blocking. - * An arbitrary pointer to pass to the function. - * currently unused. - * If true, wait (atomically) until function has completed on other CPUs. - * [RETURNS] 0 on success, else a negative status code. Does not return until - * remote CPUs are nearly ready to execute <> or are or have executed. - * - * You must not call this function with disabled interrupts or from a - * hardware interrupt handler, or bottom halfs. + * Run a function on all other CPUs. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @wait: If true, spin until function has completed on other CPUs. + * Returns: 0 on success, else a negative status code. */ +int smp_call_function( + void (*func) (void *info), void *info, int unused, int wait) { struct call_data_struct data; - int cpus = smp_num_cpus-1; + unsigned long cpuset; + + ASSERT(local_irq_is_enabled()); - if (!cpus) + cpuset = ((1UL << smp_num_cpus) - 1) & ~(1UL << smp_processor_id()); + if ( cpuset == 0 ) return 0; data.func = func; data.info = info; - atomic_set(&data.started, 0); + data.started = data.finished = 0; data.wait = wait; - if (wait) - atomic_set(&data.finished, 0); + + spin_lock(&call_lock); + + call_data = &data; + wmb(); + + send_IPI_allbutself(CALL_FUNCTION_VECTOR); + + while ( (wait ? data.finished : data.started) != cpuset ) + cpu_relax(); + + spin_unlock(&call_lock); + + return 0; +} + +/* Run a function on a subset of CPUs (may include local CPU). */ +int smp_subset_call_function( + void (*func) (void *info), void *info, int wait, unsigned long cpuset) +{ + struct call_data_struct data; ASSERT(local_irq_is_enabled()); + if ( cpuset & (1UL << smp_processor_id()) ) + { + local_irq_disable(); + (*func)(info); + local_irq_enable(); + } + + cpuset &= ((1UL << smp_num_cpus) - 1) & ~(1UL << smp_processor_id()); + if ( cpuset == 0 ) + return 0; + + data.func = func; + data.info = info; + data.started = data.finished = 0; + data.wait = wait; + spin_lock(&call_lock); call_data = &data; wmb(); - /* Send a message to all other CPUs and wait for them to respond */ - send_IPI_allbutself(CALL_FUNCTION_VECTOR); - /* Wait for response */ - while (atomic_read(&data.started) != cpus) - barrier(); + send_IPI_mask(cpuset, CALL_FUNCTION_VECTOR); - if (wait) - while (atomic_read(&data.finished) != cpus) - barrier(); + while ( (wait ? data.finished : data.started) != cpuset ) + cpu_relax(); spin_unlock(&call_lock); return 0; } -static void stop_this_cpu (void * dummy) +static void stop_this_cpu (void *dummy) { - /* - * Remove this CPU: - */ clear_bit(smp_processor_id(), &cpu_online_map); - __cli(); + disable_local_APIC(); - for(;;) __asm__("hlt"); -} -/* - * this function calls the 'stop' function on all other CPUs in the system. - */ + for ( ; ; ) + __asm__ __volatile__ ( "hlt" ); +} void smp_send_stop(void) { + /* Stop all other CPUs in the system. */ smp_call_function(stop_this_cpu, NULL, 1, 0); smp_num_cpus = 1; - __cli(); + local_irq_disable(); disable_local_APIC(); - __sti(); + local_irq_enable(); } -/* - * Nothing to do, as all the work is done automatically when - * we return from the interrupt. - */ asmlinkage void smp_event_check_interrupt(void) { ack_APIC_irq(); @@ -394,23 +343,20 @@ asmlinkage void smp_call_function_interrupt(void) { void (*func) (void *info) = call_data->func; void *info = call_data->info; - int wait = call_data->wait; ack_APIC_irq(); perfc_incrc(ipis); - /* - * Notify initiating CPU that I've grabbed the data and am - * about to execute the function - */ - mb(); - atomic_inc(&call_data->started); - /* - * At this point the info structure may be out of scope unless wait==1 - */ - (*func)(info); - if (wait) { + if ( call_data->wait ) + { + (*func)(info); + mb(); + set_bit(smp_processor_id(), &call_data->finished); + } + else + { mb(); - atomic_inc(&call_data->finished); + set_bit(smp_processor_id(), &call_data->started); + (*func)(info); } } diff --git a/xen/arch/x86/x86_32/mm.c b/xen/arch/x86/x86_32/mm.c index 8c8897a283..ac1f10def5 100644 --- a/xen/arch/x86/x86_32/mm.c +++ b/xen/arch/x86/x86_32/mm.c @@ -180,22 +180,6 @@ void subarch_init_memory(struct domain *dom_xen) } } -/* - * Allows shooting down of borrowed page-table use on specific CPUs. - * Specifically, we borrow page tables when running the idle domain. - */ -static void __synchronise_pagetables(void *mask) -{ - struct exec_domain *ed = current; - if ( ((unsigned long)mask & (1 << ed->processor)) && - is_idle_task(ed->domain) ) - write_ptbase(ed); -} -void synchronise_pagetables(unsigned long cpu_mask) -{ - __synchronise_pagetables((void *)cpu_mask); - smp_call_function(__synchronise_pagetables, (void *)cpu_mask, 1, 1); -} long do_stack_switch(unsigned long ss, unsigned long esp) { diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c index d0717bfab7..d5e9254925 100644 --- a/xen/arch/x86/x86_64/mm.c +++ b/xen/arch/x86/x86_64/mm.c @@ -236,23 +236,6 @@ void subarch_init_memory(struct domain *dom_xen) } } -/* - * Allows shooting down of borrowed page-table use on specific CPUs. - * Specifically, we borrow page tables when running the idle domain. - */ -static void __synchronise_pagetables(void *mask) -{ - struct exec_domain *ed = current; - if ( ((unsigned long)mask & (1 << ed->processor)) && - is_idle_task(ed->domain) ) - write_ptbase(ed); -} -void synchronise_pagetables(unsigned long cpu_mask) -{ - __synchronise_pagetables((void *)cpu_mask); - smp_call_function(__synchronise_pagetables, (void *)cpu_mask, 1, 1); -} - long do_stack_switch(unsigned long ss, unsigned long esp) { if ( (ss & 3) != 3 ) diff --git a/xen/common/dom0_ops.c b/xen/common/dom0_ops.c index d20a851e5d..2c9880d53d 100644 --- a/xen/common/dom0_ops.c +++ b/xen/common/dom0_ops.c @@ -266,7 +266,6 @@ long do_dom0_op(dom0_op_t *u_dom0_op) else { exec_domain_pause(ed); - synchronise_pagetables(~0UL); if ( ed->processor != (cpu % smp_num_cpus) ) set_bit(EDF_MIGRATED, &ed->ed_flags); set_bit(EDF_CPUPINNED, &ed->ed_flags); diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c index 4cae1d2a63..39f50983c2 100644 --- a/xen/common/page_alloc.c +++ b/xen/common/page_alloc.c @@ -534,8 +534,6 @@ void free_domheap_pages(struct pfn_info *pg, unsigned int order) { int i, drop_dom_ref; struct domain *d = page_get_owner(pg); - struct exec_domain *ed; - int cpu_mask = 0; ASSERT(!in_irq()); @@ -557,14 +555,11 @@ void free_domheap_pages(struct pfn_info *pg, unsigned int order) /* NB. May recursively lock from domain_relinquish_memory(). */ spin_lock_recursive(&d->page_alloc_lock); - for_each_exec_domain ( d, ed ) - cpu_mask |= 1 << ed->processor; - for ( i = 0; i < (1 << order); i++ ) { ASSERT((pg[i].u.inuse.type_info & PGT_count_mask) == 0); pg[i].tlbflush_timestamp = tlbflush_current_time(); - pg[i].u.free.cpu_mask = cpu_mask; + pg[i].u.free.cpu_mask = d->cpuset; list_del(&pg[i].list); } diff --git a/xen/common/schedule.c b/xen/common/schedule.c index 1ce8fa22d5..dddb96268a 100644 --- a/xen/common/schedule.c +++ b/xen/common/schedule.c @@ -192,7 +192,6 @@ void sched_add_domain(struct exec_domain *ed) void sched_rem_domain(struct exec_domain *ed) { - rem_ac_timer(&ed->timer); SCHED_OP(rem_task, ed); TRACE_3D(TRC_SCHED_DOM_REM, ed->domain->id, ed->eid, ed); diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h index d2d68e7cc2..17357bbcb4 100644 --- a/xen/include/asm-x86/mm.h +++ b/xen/include/asm-x86/mm.h @@ -205,12 +205,6 @@ static inline int get_page_and_type(struct pfn_info *page, int check_descriptor(struct desc_struct *d); -/* - * Use currently-executing domain's pagetables on the specified CPUs. - * i.e., stop borrowing someone else's tables if you are the idle domain. - */ -void synchronise_pagetables(unsigned long cpu_mask); - /* * The MPT (machine->physical mapping table) is an array of word-sized * values, indexed on machine frame number. It is expected that guest OSes diff --git a/xen/include/public/xen.h b/xen/include/public/xen.h index bcc5db1d8d..145dc82a32 100644 --- a/xen/include/public/xen.h +++ b/xen/include/public/xen.h @@ -124,11 +124,11 @@ * ptr[:2] -- Machine address of new page-table base to install in MMU * when in user space. * - * val[7:0] == MMUEXT_TLB_FLUSH: - * No additional arguments. + * val[7:0] == MMUEXT_TLB_FLUSH_LOCAL: + * No additional arguments. Flushes local TLB. * - * val[7:0] == MMUEXT_INVLPG: - * ptr[:2] -- Linear address to be flushed from the TLB. + * val[7:0] == MMUEXT_INVLPG_LOCAL: + * ptr[:2] -- Linear address to be flushed from the local TLB. * * val[7:0] == MMUEXT_FLUSH_CACHE: * No additional arguments. Writes back and flushes cache contents. @@ -154,6 +154,12 @@ * val[7:0] == MMUEXT_REASSIGN_PAGE: * ptr[:2] -- A machine address within the page to be reassigned to the FD. * (NB. page must currently belong to the calling domain). + * + * val[7:0] == MMUEXT_TLB_FLUSH_MULTI: + * Flush TLBs of VCPUs specified in @mask. + * + * val[7:0] == MMUEXT_INVLPG_MULTI: + * ptr[:2] -- Linear address to be flushed from TLB of VCPUs in @mask. */ #define MMU_NORMAL_PT_UPDATE 0 /* checked '*ptr = val'. ptr is MA. */ #define MMU_MACHPHYS_UPDATE 2 /* ptr = MA of frame to modify entry for */ @@ -164,8 +170,8 @@ #define MMUEXT_PIN_L4_TABLE 3 /* ptr = MA of frame to pin */ #define MMUEXT_UNPIN_TABLE 4 /* ptr = MA of frame to unpin */ #define MMUEXT_NEW_BASEPTR 5 /* ptr = MA of new pagetable base */ -#define MMUEXT_TLB_FLUSH 6 /* ptr = NULL */ -#define MMUEXT_INVLPG 7 /* ptr = VA to invalidate */ +#define MMUEXT_TLB_FLUSH_LOCAL 6 /* ptr = NULL */ +#define MMUEXT_INVLPG_LOCAL 7 /* ptr = VA to invalidate */ #define MMUEXT_FLUSH_CACHE 8 #define MMUEXT_SET_LDT 9 /* ptr = VA of table; val = # entries */ #define MMUEXT_SET_FOREIGNDOM 10 /* val[31:16] = dom */ @@ -173,6 +179,8 @@ #define MMUEXT_TRANSFER_PAGE 12 /* ptr = MA of frame; val[31:16] = dom */ #define MMUEXT_REASSIGN_PAGE 13 #define MMUEXT_NEW_USER_BASEPTR 14 +#define MMUEXT_TLB_FLUSH_MULTI 15 /* ptr = NULL; mask = VCPUs to flush */ +#define MMUEXT_INVLPG_MULTI 16 /* ptr = VA to inval.; mask = VCPUs */ #define MMUEXT_CMD_MASK 255 #define MMUEXT_CMD_SHIFT 8 @@ -180,6 +188,9 @@ #define UVMF_FLUSH_TLB 1 /* Flush entire TLB. */ #define UVMF_INVLPG 2 /* Flush the VA mapping being updated. */ +/* Backwards source compatibility. */ +#define MMUEXT_TLB_FLUSH MMUEXT_TLB_FLUSH_LOCAL +#define MMUEXT_INVLPG MMUEXT_INVLPG_LOCAL /* * Commands to HYPERVISOR_sched_op(). @@ -257,8 +268,9 @@ typedef u16 domid_t; */ typedef struct { - memory_t ptr; /* Machine address of PTE. */ - memory_t val; /* New contents of PTE. */ + memory_t ptr; /* Machine address of PTE. */ + memory_t val; /* New contents of PTE. */ + /*unsigned long mask;*/ /* VCPU mask (certain extended commands). */ } PACKED mmu_update_t; /* diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h index 3098586f17..b28a955b0f 100644 --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h @@ -143,6 +143,9 @@ struct domain struct exec_domain *exec_domain[MAX_VIRT_CPUS]; + /* Bitmask of CPUs on which this domain is running. */ + unsigned long cpuset; + struct arch_domain arch; }; @@ -250,6 +253,12 @@ void init_idle_task(void); void domain_wake(struct exec_domain *d); void domain_sleep(struct exec_domain *d); +/* + * Force loading of currently-executing domain state on the specified set + * of CPUs. This is used to counteract lazy state switching where required. + */ +void synchronise_lazy_execstate(unsigned long cpuset); + extern void context_switch( struct exec_domain *prev, struct exec_domain *next); @@ -330,14 +339,21 @@ static inline void exec_domain_pause(struct exec_domain *ed) ASSERT(ed != current); atomic_inc(&ed->pausecnt); domain_sleep(ed); + synchronise_lazy_execstate(ed->domain->cpuset & (1UL << ed->processor)); } static inline void domain_pause(struct domain *d) { struct exec_domain *ed; - for_each_exec_domain(d, ed) - exec_domain_pause(ed); + for_each_exec_domain( d, ed ) + { + ASSERT(ed != current); + atomic_inc(&ed->pausecnt); + domain_sleep(ed); + } + + synchronise_lazy_execstate(d->cpuset); } static inline void exec_domain_unpause(struct exec_domain *ed) @@ -351,7 +367,7 @@ static inline void domain_unpause(struct domain *d) { struct exec_domain *ed; - for_each_exec_domain(d, ed) + for_each_exec_domain( d, ed ) exec_domain_unpause(ed); } @@ -361,30 +377,26 @@ static inline void exec_domain_unblock(struct exec_domain *ed) domain_wake(ed); } -static inline void domain_unblock(struct domain *d) -{ - struct exec_domain *ed; - - for_each_exec_domain(d, ed) - exec_domain_unblock(ed); -} - static inline void domain_pause_by_systemcontroller(struct domain *d) { struct exec_domain *ed; - for_each_exec_domain(d, ed) { + for_each_exec_domain ( d, ed ) + { ASSERT(ed != current); if ( !test_and_set_bit(EDF_CTRLPAUSE, &ed->ed_flags) ) domain_sleep(ed); } + + synchronise_lazy_execstate(d->cpuset); } static inline void domain_unpause_by_systemcontroller(struct domain *d) { struct exec_domain *ed; - for_each_exec_domain(d, ed) { + for_each_exec_domain ( d, ed ) + { if ( test_and_clear_bit(EDF_CTRLPAUSE, &ed->ed_flags) ) domain_wake(ed); } diff --git a/xen/include/xen/smp.h b/xen/include/xen/smp.h index 13e370cdca..f3f08127b6 100644 --- a/xen/include/xen/smp.h +++ b/xen/include/xen/smp.h @@ -43,8 +43,10 @@ extern void smp_commence(void); /* * Call a function on all other processors */ -extern int smp_call_function (void (*func) (void *info), void *info, - int retry, int wait); +extern int smp_call_function( + void (*func) (void *info), void *info, int retry, int wait); +extern int smp_subset_call_function( + void (*func) (void *info), void *info, int wait, unsigned long cpuset); /* * True once the per process idle is forked @@ -84,7 +86,8 @@ extern volatile int smp_msg_id; #define kernel_lock() #define cpu_logical_map(cpu) 0 #define cpu_number_map(cpu) 0 -#define smp_call_function(func,info,retry,wait) ({ 0; }) +#define smp_call_function(func,info,retry,wait) 0 +#define smp_subset_call_function(f,i,w,c) ({ if ( (c&1) ) (*f)(i); 0; }) #define cpu_online_map 1 #endif -- 2.30.2